{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "../data/test.csv\n",
      "../data/train.csv\n",
      "../data/ml_submission.csv\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "for dirname, _, filenames in os.walk('../data/'):\n",
    "    for filename in filenames:\n",
    "        print(os.path.join(dirname, filename))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/yphacker/opt/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py:3: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "# 一些常规特征\n",
    "import pandas as pd\n",
    "from tqdm.autonotebook import *\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "\n",
    "tqdm.pandas()\n",
    "\n",
    "train = pd.read_csv('../data/train.csv')\n",
    "test = pd.read_csv('../data/test.csv')\n",
    "\n",
    "data = pd.concat([train, test], axis=0, sort=False).reset_index(drop=True)\n",
    "data = data.fillna(-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def salary_range_min(row):\n",
    "    try:\n",
    "        result = int(str(row['salary_range']).split('-')[0])\n",
    "    except Exception:\n",
    "        result = -1\n",
    "    return result\n",
    "\n",
    "def salary_range_max(row):\n",
    "    try:\n",
    "        result = int(str(row['salary_range']).split('-')[1])\n",
    "    except Exception:\n",
    "        result = -1\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c20d5ce826b844d5ace611961ded1424",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5f04dda3fa434bb3a1606657cd06ce51",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a37e66aef3d4467dbf2b534bbf31dfda",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, max=17880.0), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "normal_feature = pd.DataFrame()\n",
    "normal_feature['salary_min'] = data.progress_apply(lambda row:salary_range_min(row), axis=1)\n",
    "normal_feature['salary_max'] = data.progress_apply(lambda row:salary_range_max(row), axis=1)\n",
    "normal_feature['salary_median'] = (normal_feature['salary_max'] + normal_feature['salary_min'])/2\n",
    "normal_feature['salary_range'] = normal_feature['salary_max'] - normal_feature['salary_min']\n",
    "normal_feature['telecommuting'] = list(data['telecommuting'])\n",
    "normal_feature['has_company_logo'] = list(data['has_company_logo'])\n",
    "normal_feature['has_questions'] = list(data['has_questions'])\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "labelencoder = LabelEncoder()\n",
    "normal_feature['employment_type'] = labelencoder.fit_transform(data['employment_type'].astype(str))\n",
    "normal_feature['required_experience'] = labelencoder.fit_transform(data['required_experience'].astype(str))\n",
    "normal_feature['required_education'] = labelencoder.fit_transform(data['required_education'].astype(str))\n",
    "normal_feature['industry'] = labelencoder.fit_transform(data['industry'].astype(str))\n",
    "normal_feature['function'] = labelencoder.fit_transform(data['function'].astype(str))\n",
    "\n",
    "data['review'] = data.progress_apply(lambda row:str(row['title']) + ' ' + str(row['location']) + ' ' + str(row['company_profile']) + ' ' + \n",
    "                                   str(row['description']) + ' ' + str(row['department']) + ' ' + str(row['requirements']) + ' ' + str(row['benefits']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "开始计算tf-idf特征\n",
      "计算结束\n",
      "开始进行一些前期处理\n",
      "处理完毕\n",
      "\n",
      "****开始跑 LogisticRegression ****\n",
      "LogisticRegression 处理完毕\n",
      "五折结果 [0.97936, 0.98049, 0.97738, 0.98473, 0.97963]\n",
      "平均结果 0.9803179999999999\n",
      "\n",
      "****开始跑 SGDClassifier ****\n",
      "SGDClassifier 处理完毕\n",
      "五折结果 [0.98926, 0.99067, 0.98727, 0.99123, 0.98868]\n",
      "平均结果 0.989422\n",
      "\n",
      "****开始跑 PassiveAggressiveClassifier ****\n",
      "PassiveAggressiveClassifier 处理完毕\n",
      "五折结果 [0.98954, 0.98954, 0.98699, 0.99265, 0.9901]\n",
      "平均结果 0.9897640000000001\n",
      "\n",
      "****开始跑 RidgeClassfiy ****\n",
      "RidgeClassfiy 处理完毕\n",
      "五折结果 [0.98332, 0.98501, 0.98275, 0.98982, 0.98359]\n",
      "平均结果 0.984898\n",
      "\n",
      "****开始跑 LinearSVC ****\n",
      "LinearSVC 处理完毕\n",
      "五折结果 [0.98699, 0.98812, 0.98558, 0.99152, 0.9867]\n",
      "平均结果 0.9877819999999999\n"
     ]
    }
   ],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import jieba\n",
    "from tqdm import *\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "import numpy as np\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.linear_model import PassiveAggressiveClassifier\n",
    "from sklearn.linear_model import RidgeClassifier\n",
    "from sklearn.naive_bayes import BernoulliNB\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "\n",
    "df_train = data[:len(train)]\n",
    "df_test = data[len(train):]\n",
    "\n",
    "df_train['label'] = df_train['fraudulent'].astype(int)\n",
    "data = pd.concat([df_train, df_test], axis=0, sort=False)\n",
    "data['review'] = data['review'].apply(lambda row:str(row))\n",
    "\n",
    "############################ tf-idf ############################\n",
    "print('开始计算tf-idf特征')\n",
    "tf = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)\n",
    "discuss_tf = tf.fit_transform(data['review']).tocsr()\n",
    "print('计算结束')\n",
    "\n",
    "############################ 切分数据集 ##########################\n",
    "print('开始进行一些前期处理')\n",
    "train_feature = discuss_tf[:len(df_train)]\n",
    "score = df_train['label']\n",
    "test_feature = discuss_tf[len(df_train):]\n",
    "print('处理完毕')\n",
    "\n",
    "######################### 模型函数(返回sklean_stacking结果) ########################\n",
    "def get_sklearn_classfiy_stacking(clf, train_feature, test_feature, score, model_name, class_number, n_folds, train_num, test_num):\n",
    "    print('\\n****开始跑', model_name, '****')\n",
    "    stack_train = np.zeros((train_num, class_number))\n",
    "    stack_test = np.zeros((test_num, class_number))\n",
    "    score_mean = []\n",
    "    skf = StratifiedKFold(n_splits=n_folds, random_state=42)\n",
    "    tqdm.desc = model_name\n",
    "    for i, (tr, va) in enumerate(skf.split(train_feature, score)):\n",
    "        clf.fit(train_feature[tr], score[tr])\n",
    "        score_va = clf._predict_proba_lr(train_feature[va])\n",
    "        score_te = clf._predict_proba_lr(test_feature)\n",
    "        score_single = accuracy_score(score[va], clf.predict(train_feature[va]))\n",
    "        score_mean.append(np.around(score_single, 5))\n",
    "        stack_train[va] += score_va\n",
    "        stack_test += score_te\n",
    "    stack_test /= n_folds\n",
    "    stack = np.vstack([stack_train, stack_test])\n",
    "    df_stack = pd.DataFrame()\n",
    "    df_stack['tfidf_' + model_name + '_classfiy_{}'.format(1)] = stack[:, 1]\n",
    "    print(model_name, '处理完毕')\n",
    "    return df_stack, score_mean\n",
    "\n",
    "model_list = [\n",
    "    ['LogisticRegression', LogisticRegression(random_state=42, C=3)],\n",
    "    ['SGDClassifier', SGDClassifier(random_state=42, loss='perceptron')],\n",
    "    ['PassiveAggressiveClassifier', PassiveAggressiveClassifier(random_state=42, C=2)],\n",
    "    ['RidgeClassfiy', RidgeClassifier(random_state=42)],\n",
    "    ['LinearSVC', LinearSVC(random_state=42)]\n",
    "]\n",
    "\n",
    "stack_feature = pd.DataFrame()\n",
    "for i in model_list:\n",
    "    stack_result, score_mean = get_sklearn_classfiy_stacking(i[1], train_feature, test_feature, score, i[0], 2, 5, len(df_train), len(df_test))\n",
    "    stack_feature = pd.concat([stack_feature, stack_result], axis=1, sort=False)\n",
    "    print('五折结果', score_mean)\n",
    "    print('平均结果', np.mean(score_mean))\n",
    "normal_feature = pd.concat([stack_feature, normal_feature], axis=1, sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "train = pd.read_csv('../data/train.csv')\n",
    "test = pd.read_csv('../data/test.csv')\n",
    "\n",
    "\n",
    "train_feature = normal_feature[:len(train)]\n",
    "test_feature = normal_feature[len(train):]\n",
    "\n",
    "train_feature['label'] = train['fraudulent'].astype(int)\n",
    "not_cols = ['label']\n",
    "cols = [col for col in train_feature.columns if col not in not_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\tvalid_0's binary_logloss: 0.0404973\n",
      "[1000]\tvalid_0's binary_logloss: 0.0286302\n",
      "[1500]\tvalid_0's binary_logloss: 0.0264742\n",
      "[2000]\tvalid_0's binary_logloss: 0.0267933\n",
      "Early stopping, best iteration is:\n",
      "[1572]\tvalid_0's binary_logloss: 0.0263894\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\tvalid_0's binary_logloss: 0.0403522\n",
      "[1000]\tvalid_0's binary_logloss: 0.0272066\n",
      "[1500]\tvalid_0's binary_logloss: 0.0246409\n",
      "[2000]\tvalid_0's binary_logloss: 0.0245345\n",
      "Early stopping, best iteration is:\n",
      "[1870]\tvalid_0's binary_logloss: 0.0244055\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\tvalid_0's binary_logloss: 0.0459013\n",
      "[1000]\tvalid_0's binary_logloss: 0.0339759\n",
      "[1500]\tvalid_0's binary_logloss: 0.0328021\n",
      "Early stopping, best iteration is:\n",
      "[1446]\tvalid_0's binary_logloss: 0.0327748\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\tvalid_0's binary_logloss: 0.0420373\n",
      "[1000]\tvalid_0's binary_logloss: 0.0291567\n",
      "[1500]\tvalid_0's binary_logloss: 0.0268898\n",
      "[2000]\tvalid_0's binary_logloss: 0.0270719\n",
      "Early stopping, best iteration is:\n",
      "[1665]\tvalid_0's binary_logloss: 0.026734\n",
      "Training until validation scores don't improve for 500 rounds.\n",
      "[500]\tvalid_0's binary_logloss: 0.0494006\n",
      "[1000]\tvalid_0's binary_logloss: 0.0384844\n",
      "[1500]\tvalid_0's binary_logloss: 0.037595\n",
      "Early stopping, best iteration is:\n",
      "[1345]\tvalid_0's binary_logloss: 0.0373089\n",
      "                                           names   imp\n",
      "1                 tfidf_SGDClassifier_classfiy_1  8723\n",
      "3                 tfidf_RidgeClassfiy_classfiy_1  8529\n",
      "0            tfidf_LogisticRegression_classfiy_1  8409\n",
      "2   tfidf_PassiveAggressiveClassifier_classfiy_1  8049\n",
      "4                     tfidf_LinearSVC_classfiy_1  5377\n",
      "15                                      industry  5328\n",
      "16                                      function  2670\n",
      "12                               employment_type  2538\n",
      "8                                   salary_range  2459\n",
      "13                           required_experience  2413\n",
      "10                              has_company_logo  1869\n",
      "14                            required_education  1508\n",
      "5                                     salary_min  1290\n",
      "6                                     salary_max   944\n",
      "7                                  salary_median   495\n",
      "9                                  telecommuting   158\n",
      "11                                 has_questions    65\n",
      "5 Fold result: [0.9923664122137404, 0.9920814479638009, 0.9909502262443439, 0.9926470588235294, 0.9889674681753889]\n",
      "mean result: 0.9914025226841607\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import gc\n",
    "from sklearn.model_selection import KFold,StratifiedKFold\n",
    "import lightgbm as lgb\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "import os\n",
    "version = 'lgb_model_normal_feature'\n",
    "\n",
    "def evaluate_5_fold(train_df, test_df, cols, test=False):\n",
    "    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1017)\n",
    "    y_test = 0\n",
    "    oof_train = np.zeros((train_df.shape[0], 1))\n",
    "    mertics_front = []\n",
    "    for i, (train_index, val_index) in enumerate(kf.split(train_df[cols],train_df.label)):\n",
    "        X_train, y_train = train_df.loc[train_index, cols], train_df.label.values[train_index]\n",
    "        X_val, y_val = train_df.loc[val_index, cols], train_df.label.values[val_index]\n",
    "\n",
    "        lgb_train = lgb.Dataset(\n",
    "            X_train, y_train)\n",
    "        lgb_eval = lgb.Dataset(\n",
    "            X_val, y_val,\n",
    "            reference=lgb_train)\n",
    "        params = {\n",
    "            'boosting_type': 'gbdt',\n",
    "            'learning_rate' : 0.003, \n",
    "            'verbose': 0,\n",
    "            'objective':'binary',\n",
    "            'seed': 1024,\n",
    "            'nthread': 50,\n",
    "            'subsample': 0.85,\n",
    "            'colsample_bytree': 0.85,\n",
    "            'reg_alpha': 0.3,\n",
    "            'reg_lamdba': 0.243,\n",
    "            'num_leaves': 512,\n",
    "        }\n",
    "        gbm = lgb.train(params,\n",
    "                        lgb_train,\n",
    "                        valid_sets=lgb_eval,\n",
    "                        num_boost_round=10000,\n",
    "                        early_stopping_rounds=500,\n",
    "                        verbose_eval=500,\n",
    "                        )\n",
    "        y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)\n",
    "        if test:\n",
    "            y_test += gbm.predict(test_df[cols], num_iteration=gbm.best_iteration)\n",
    "        oof_train[val_index] = np.array(y_pred).reshape(len(val_index), 1)\n",
    "        mertics_front.append(accuracy_score(train_df.label.values[val_index], np.around(y_pred)))\n",
    "    y_test/= 5\n",
    "    feature_list = pd.DataFrame()\n",
    "    feature_list['names'] = cols\n",
    "    feature_list['imp'] = gbm.feature_importance()\n",
    "    feature_list = feature_list.sort_values(by='imp', ascending=False)\n",
    "    print(feature_list)\n",
    "    print('5 Fold result:', mertics_front)\n",
    "    print('mean result:', np.mean(mertics_front))\n",
    "    gc.collect()\n",
    "    return mertics_front, oof_train, y_test, feature_list\n",
    "f_score, oof_train, y_test, imp = evaluate_5_fold(train_feature, test_feature, cols, True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_change_label = y_test.copy()\n",
    "test_change_label[test_change_label >= 0.05] = 1\n",
    "test_change_label[test_change_label < 0.05] = 0\n",
    "result = pd.DataFrame()\n",
    "result['id'] = np.arange(0, len(y_test), 1)\n",
    "result['result'] = np.around(test_change_label)\n",
    "result['result'] = result['result'].astype(int)\n",
    "result.to_csv('lgb_baseline.csv', index=False, header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    104\n",
       "1     96\n",
       "Name: result, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result.result.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
